Preparation of data for modeling
There will be three main steps:
- retrieve date of interpellation since it was available in
metadata
- make sure author is ok - it is needed for proper network
analysis
- clean stemmed data
Load data
First step - load data from preprocessed files
pi_files <- list.files("data", pattern = "^(pi_)")
all_pi <- do.call(rbind,
lapply(X = pi_files, FUN = function(i){
mydb <- dbConnect(RSQLite::SQLite(), file.path("data", i))
this_cont <- dbGetQuery(conn = mydb,
statement = "select metadata.*, ipcontent.content from ipcontent left join metadata on metadata.id = ipcontent.id")
dbDisconnect(mydb)
this_cont$period <- gsub("pi_", replacement = "", x = gsub(pattern = ".sqlite", replacement = "", x = i))
this_cont
}))
# add information about length of each document
all_pi <- all_pi %>%
mutate(len = nchar(CONTENT))
# create dataset with start and end of each period
period_bands <- data.frame(period = c("1997-2001", "2001-2005", "2005-2007"),
start_date = as.Date(c("1997-10-20", "2001-10-19", "2005-10-19")),
end_date = as.Date(c("2001-10-18", "2005-10-18", "2007-09-07")))
Create date of interpellation
Second step - create date of creation of each interpellation
# assumption - each document ends in date passed in that format: <day> <month as a word> <year>
last_part_len <- 24
vec_to_extract_dates <- all_pi$CONTENT %>% substr(start = nchar(.) - last_part_len, nchar(.))
# there are some cases that will be overwritten and not treated with general approach (59 out of 28074 [0.2%]):
# because the last part of document did not include date (e.g. it consists of footnotes of some sort)
# because there were mistakes (e.g. year 3000)
list_exceptions <- jsonlite::fromJSON(txt = "dicts/pi_dates.json")
vec_to_extract_dates[match(x = names(list_exceptions), table = all_pi$DOC)] <- unlist(list_exceptions)
# remove last part of date
vec_to_extract_dates <- gsub(pattern = "(r|roku)\\s*\\.*\\,*$", replacement = "", x = vec_to_extract_dates)
vec_to_extract_dates <- trimws(vec_to_extract_dates)
# extract year
years <- as.numeric(substr(vec_to_extract_dates, regexpr(pattern = "\\d+\\.*$", text = vec_to_extract_dates), nchar(vec_to_extract_dates)))
# check if everything is ok
if (any(is.na(years))) {
idx <- which(is.na(years))
message(length(idx))
vec_to_extract_dates[idx[1:min(10, length(idx))]]
}
if (!all(years %in% c(1997:2007))) {
idx <- which(!years %in% c(1997:2007))
message(length(idx))
vec_to_extract_dates[idx[1:min(10, length(idx))]]
}
# remove year to ease the process
vec_to_extract_dates <- trimws(substr(vec_to_extract_dates, 1, nchar(vec_to_extract_dates) - 4))
# extract month
months_vec <- rep(0, length = length(vec_to_extract_dates))
months_dict <- list("1" = "stycznia",
"2" = c("lutego", "luty"),
"3" = "marca",
"4" = c("kwietnia", "kwietnie", "kwitnia", "kwietna"),
"5" = "maja",
"6" = "czerwca",
"7" = "lipca",
"8" = "sierpnia",
"9" = c("września", "wrzesień"),
"10" = "października",
"11" = c("listopada", "listopad"),
"12" = "grudnia")
for (m in 1:length(months_dict)) {
vals <- regexpr(pattern = sprintf("(%s)", paste0(months_dict[[m]], collapse = "|")), text = vec_to_extract_dates)
idx <- which(vals > 0)
months_vec[idx] <- as.numeric(names(months_dict)[m])
vec_to_extract_dates[idx] <- substr(vec_to_extract_dates[idx], 1, vals[idx] -1)
}
# check if everything is ok
if (!all(months_vec %in% c(1:12))) {
idx <- which(!months_vec %in% c(1:12))
message(length(idx))
vec_to_extract_dates[idx[1:min(10, length(idx))]]
}
vec_to_extract_dates <- trimws(vec_to_extract_dates)
# extract day
days_vec <- as.numeric(substr(vec_to_extract_dates, regexpr(pattern = "\\d+$", text = vec_to_extract_dates), nchar(vec_to_extract_dates)))
# check if everything is ok
if (!all(days_vec %in% c(1:31))) {
idx <- which(!days_vec %in% c(1:31))
message(length(idx))
vec_to_extract_dates[idx[1:min(10, length(idx))]]
}
# add to dataset
all_pi$date <- as.Date(sprintf("%s-%02d-%02d", years, months_vec, days_vec))
Check if all is ok - each date should be between start and end of
each period
all_pi %>%
left_join(y = period_bands, by = "period") %>%
mutate(days_after_start = as.numeric(date - start_date)) %>%
filter(days_after_start < 0) %>%
kable() %>%
kable_styling()
|
ID
|
AUTHOR
|
DOC
|
CONTENT
|
period
|
len
|
date
|
start_date
|
end_date
|
days_after_start
|
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
|
–:
|
:——
|
:—
|
:——-
|
:——
|
—:
|
:—-
|
:———-
|
:——–
|
—————-:
|
But sometimes interpellation is written in a period
t is delivered to Sejm in period
t-1.
all_pi %>%
left_join(y = period_bands, by = "period") %>%
mutate(days_before_end = as.numeric(end_date - date)) %>%
filter(days_before_end < 0) %>%
summarise(n(), min(days_before_end), median(days_before_end), max(days_before_end)) %>%
kable() %>%
kable_styling()
|
n()
|
min(days_before_end)
|
median(days_before_end)
|
max(days_before_end)
|
|
31
|
-172
|
-11
|
-1
|
Clean up authors
Third step - cleaning up data regarding author of interpellation
all_pi %>% group_by(period) %>%
summarise(n_authors = length(unique(AUTHOR))) %>%
kable() %>%
kable_styling()
|
period
|
n_authors
|
|
1997-2001
|
559
|
|
2001-2005
|
687
|
|
2005-2007
|
686
|
This is strange, those supposed to be parliamentary interpellations.
In Polish Sejm there is 460 members, but data for each term suggest
there are many more of them.
Couple things to consider:
- member of parliament was sometimes described with first and second
name (‘Adam Bielan’ and ‘Adam Jerzy Bielan’ is the same person)
- couple of members cosign one interpellation (e.g. ‘Adam Bielan i
Zbigniew Ziobro’)
- in course of term member’s list might change due to different things
(death, resigning etc.)
Load the member list. This is a list of member for each term at the
end of the term.
# for further analysis
posl_json <- jsonlite::fromJSON(txt = "dicts/poslowie.json")
posl_df <- do.call(rbind,
lapply(names(posl_json),
FUN = function(i){
posl_json_period <- posl_json[[i]]
posl_df <- do.call(rbind,
lapply(1:length(posl_json_period),
FUN = function(j){
nn <- names(posl_json_period)[j]
ps <- unlist(posl_json_period[[nn]])
data.frame(AUTHOR = ps, ugr = names(posl_json_period)[j])
}))
row.names(posl_df) <- NULL
posl_df$period <- i
posl_df
}))
Is everything is ok with that data?
posl_df %>%
group_by(period) %>%
summarise(n = n(), uq = length(unique(AUTHOR)), any_duplicates = n > uq, members_dup = paste0(AUTHOR[duplicated(AUTHOR)], collapse = ", ")) %>%
kable() %>%
kable_styling()
|
period
|
n
|
uq
|
any_duplicates
|
members_dup
|
|
1997-2001
|
460
|
459
|
TRUE
|
Maciej Jankowski
|
|
2001-2005
|
460
|
459
|
TRUE
|
Ewa Janik
|
|
2005-2007
|
460
|
460
|
FALSE
|
|
There are duplicates in two periods. Let see how many interpellations
are for those members.
all_pi %>% filter(regexpr(text = AUTHOR, pattern = "(Ewa Janik)|(Maciej Jankowski)") > 0) %>%
group_by(period, AUTHOR) %>% summarise(n()) %>%
kable() %>%
kable_styling()
|
period
|
AUTHOR
|
n()
|
|
1997-2001
|
Ewa Janik
|
29
|
|
1997-2001
|
Ewa Janik i Seweryn Kaczmarek
|
2
|
|
1997-2001
|
Ewa Janik i Władysław Szkop
|
2
|
|
1997-2001
|
Maciej Jankowski
|
2
|
|
2005-2007
|
Ewa Janik
|
3
|
Apparently there are only 2 (out of 28k) observations are for the
periods where there are mulitple names on member list. For simplicity
I’ll remove one of those members.
posl_df <- posl_df %>% filter(!(AUTHOR == "Maciej Jankowski" & ugr == "Posłowie niezrzeszeni" & period == "1997-2001"))
posl_df <- posl_df %>% filter(!(AUTHOR == "Ewa Janik" & period == "2001-2005"))
posl_df <- rbind(posl_df, data.frame(AUTHOR = "Ewa Janik", period = "2001-2005", ugr = "Klub Parlamentarny Sojuszu Lewicy Demokratycznej"))
Check how many interpellations are not accounted for author’s
party
all_pi %>%
left_join(posl_df, by = c("AUTHOR", "period")) %>%
group_by(period) %>%
summarise(n_mem = n_distinct(AUTHOR), count = n(), without_party = sum(is.na(ugr)) / count,
n_mem_without_party = length(unique(AUTHOR[is.na(ugr)])), without_part_count = n_mem_without_party / n_mem) %>%
kable() %>%
kable_styling()
|
period
|
n_mem
|
count
|
without_party
|
n_mem_without_party
|
without_part_count
|
|
1997-2001
|
559
|
7444
|
0.1575766
|
276
|
0.4937388
|
|
2001-2005
|
687
|
10906
|
0.1444159
|
322
|
0.4687045
|
|
2005-2007
|
686
|
9724
|
0.2317976
|
317
|
0.4620991
|
- good news: between 1 in 7 and 1 in 5 interpollation is not accounted
for auther (not great, not terrible) (column
without_party)
- bad news: half of authors are not recognized (column
without_part_count)
Let see which authors do not rest in posel list
all_pi %>%
left_join(posl_df, by = c("AUTHOR", "period")) %>%
group_by(AUTHOR, ugr) %>%
summarise(k = n()) %>%
filter(is.na(ugr)) %>%
head() %>%
kable() %>%
kable_styling()
|
AUTHOR
|
ugr
|
k
|
|
Adam Bielan
|
NA
|
6
|
|
Adam Bielan i Zbigniew Ziobro
|
NA
|
4
|
|
Adam Jerzy Bielan
|
NA
|
5
|
|
Adam Markiewicz i Andrzej Otręba
|
NA
|
1
|
|
Adam Ołdakowski i Maria Zbyrowska
|
NA
|
1
|
|
Adam Ołdakowski i Józef Stępkowski
|
NA
|
31
|
As stated before
- member was sometimes described with first and second name (‘Adam
Bielan’ vs ‘Adam Jerzy Bielan’)
- couple of members cosign one interpellation (e.g. ‘Adam Bielan i
Zbigniew Ziobro’)
# create dict author + period
un_authors <- all_pi %>% select(AUTHOR, period) %>% unique()
un_authors <- un_authors %>% left_join(y = posl_df, by = c("AUTHOR", "period"))
To clean up that mess I’ll introduce:
- one manual change
- assume that first member that signs the interpellation is its
author
- if posel uses second name shorten it to first name and surname
# some manual changes
un_authors$AUTHOR[un_authors$AUTHOR == "łukasz Zbonikowski"] <- "Łukasz Zbonikowski"
# assumption - if multiple poeple sign a interplataion - assign it to the first on the list (most proboalby person responisbile for phrasing)
# most likely it will be Jan Kowalski i Zbigniew Nowak (so they are separated by small i)
# if posel uses second name - shorten it to first name and surname
un_authors$stripped <- lapply(X = un_authors$AUTHOR,
FUN = function(i){
if (i == "na") return(i)
gg <- strsplit(i, split = "(\\s|Senator|Poseł|Posel|Poslowie|Z należytym szacunkiem)")[[1]]
gg <- trimws(gg)
gg <- gg[nchar(gg) > 0]
ggs <- regexpr(pattern = "^[[:upper:]]", text = gg) > 0
first_zero <- which(!ggs)[1]
if (is.na(first_zero)) first_zero <- length(ggs) + 1
last_one <- max(which(ggs[1:(first_zero - 1)]))
paste0(gg[c(1, last_one)], collapse = " ")
}) %>% unlist()
# join it with party dictionairy
un_authors2 <- un_authors %>% left_join(y = posl_df %>% select(stripped = AUTHOR, period, ugr2 = ugr), by = c("stripped", "period"))
# repeat manual changes and join with text data and see if that he
all_pi$AUTHOR[all_pi$AUTHOR == "łukasz Zbonikowski"] <- "Łukasz Zbonikowski"
all_pi %>%
left_join(un_authors2 %>% select(AUTHOR, period, ugr), by = c("AUTHOR", "period")) %>%
left_join(un_authors2 %>% select(AUTHOR, stripped, period, ugr2) %>% unique(), by = c("AUTHOR", "period")) %>%
group_by(period) %>%
summarise(count_original = n_distinct(AUTHOR), count_clean = n_distinct(stripped), count_int = n(),
perc_without_original = sum(is.na(ugr)) / count_int, perc_without_clean = sum(is.na(ugr2)) / count_int) %>%
kable() %>%
kable_styling()
|
period
|
count_original
|
count_clean
|
count_int
|
perc_without_original
|
perc_without_clean
|
|
1997-2001
|
559
|
339
|
7444
|
0.1575766
|
0.0569586
|
|
2001-2005
|
687
|
435
|
10906
|
0.1444159
|
0.0790391
|
|
2005-2007
|
685
|
429
|
9752
|
0.2335931
|
0.0796760
|
After clean up, instead of 1 in 5 interpellations without an author
now I have only 1 in 12 missing.
all_pi <- all_pi %>%
left_join(un_authors2 %>% select(AUTHOR, AUTHOR_CLEAN = stripped, period, party = ugr2) %>% unique(), by = c("AUTHOR", "period"))
Clean stemmed data
The data was stem outsied the script - load it
# I've tagged already
pi_tagged_files <- paste0("tagged_", pi_files)
all_pi_tagged <- do.call(rbind,
lapply(X = pi_tagged_files,
FUN = function(i){
mydb <- dbConnect(RSQLite::SQLite(), file.path("data", i))
this_cont <- dbGetQuery(conn = mydb, statement = "select * from tagged")
dbDisconnect(mydb)
this_cont$period <- gsub("tagged_pi_", replacement = "", x = gsub(pattern = ".sqlite", replacement = "", x = i))
this_cont
}))
head(all_pi_tagged) %>%
kable() %>%
kable_styling()
|
ID
|
ORG
|
TAGGED
|
period
|
|
1
|
Na
|
na
|
1997-2001
|
|
1
|
początku
|
początek
|
1997-2001
|
|
1
|
bieżącego
|
bieżący
|
1997-2001
|
|
1
|
roku
|
rok
|
1997-2001
|
|
1
|
zwrócił
|
zwrócić
|
1997-2001
|
|
1
|
em
|
być
|
1997-2001
|
The data is stripped from whitespaces and lowercased (unless it is a
recognizable name). How many tags (stemmed entities) do I have?
n_distinct(all_pi_tagged$TAGGED)
## [1] 64840
Cleaning:
- remove words that correspond to dates
- remove polish stop words
- remove number (in several formats)
- remove dates (in several formats)
- remove words that are most likely in a greeting or in the signature
part of interpellation
# cleaning
all_pi_tagged_clean <- all_pi_tagged %>%
# remove part with date
anti_join(y = data.frame(TAGGED = c("dzień", "styczeń", "luty", "marzec", "kwiecień", "maj", "czerwiec", "lipiec", "sierpień", "wrzesień",
"październik", "listopad", "grudzień", "rok")), by = "TAGGED") %>%
# remove stopwords
anti_join(y = polish_stop_words_df("TAGGED"))
# remove numbers
all_pi_tagged_clean <- all_pi_tagged_clean[regexpr(pattern = "^\\d+$", text = all_pi_tagged_clean$TAGGED) < 0, ]
# %, +
all_pi_tagged_clean <- all_pi_tagged_clean[regexpr(pattern = "^\\W$", text = all_pi_tagged_clean$TAGGED) < 0, ]
# numbers with comma as decimal point
all_pi_tagged_clean <- all_pi_tagged_clean[regexpr(pattern = "^\\d+,\\d+$", text = all_pi_tagged_clean$TAGGED) < 0, ]
# numbers with dot as decimal point
all_pi_tagged_clean <- all_pi_tagged_clean[regexpr(pattern = "^\\d+\\.\\d+$", text = all_pi_tagged_clean$TAGGED) < 0, ]
# remove dates xx.xx.xxxx
all_pi_tagged_clean <- all_pi_tagged_clean[regexpr(pattern = "^\\d{1,2}\\.(0\\d{1}|\\d{1,2})\\.(\\d{2}|\\d{4})$", text = all_pi_tagged_clean$TAGGED) < 0, ]
# remove dates xx-xx-xxxx
all_pi_tagged_clean <- all_pi_tagged_clean[regexpr(pattern = "^\\d{1,2}-(0\\d{1}|\\d{1,2})-(\\d{2}|\\d{4})$", text = all_pi_tagged_clean$TAGGED) < 0, ]
# remove words like 'szanowny', 'minister', 'premier' because they come from a greeting at the beginnig of the interpellation or at the end ("poseł", "poważanie")
all_pi_tagged_clean <- all_pi_tagged_clean %>% filter(!TAGGED %in% c("szanowny", "minister", "premier", "poseł", "poważanie"))
How many tags are in the end?
un_tagged <- sort(unique(all_pi_tagged_clean$TAGGED))
length(un_tagged)
## [1] 56516
In order to incorporate Text Mining algorithms I’ll create documents
from processed stemmed data.
all_pi_tagged <- all_pi_tagged_clean %>%
group_by(period, ID) %>%
summarise(CONTENT = paste0(TAGGED, collapse = " "))
head(all_pi_tagged, 2) %>%
kable() %>%
kable_styling()
|
period
|
ID
|
CONTENT
|
|
1997-2001
|
1
|
początek bieżący zwrócić zdrowie sprawa przestrzegać przepis ustawa
dotyczyć dopuszczalność przerywać ciąża wskazać niektóry województwo
województwo kielecki obowiązywać ustawa kobieta chcieć skorzystać
przysługiwać prawo mieć możliwość wynikać fakt ginekolog publiczny
placówka opieka zdrowotny gremialnie odmawiać wykonać dopuszczalny
prawnie zabieg spełnić zainteresowany kobieta przewidzieć przepis wymóg
zaznaczyć przepis dawać określić prawo kobieta mieć obowiązek
uprawnienie gwarantować odpowiedź zdrowie zobowiązać podjąć działanie
zmierzać województwo kielecki doprowadzić prawidłowy funkcjonować ustawa
zgodzić chyba móc województwo mieć placówka kobieta móc liczyć prawo
uszanować niezrozumiały chóralny powoływać ginekolog kodeks etyka
lekarski często ginekolog wykonywać zabieg aborcja prywatnie zacisze
własny gabinet oczywiście odpowiedni opłata oceniać szczyt hipokryzja
pomijać kodeks zawodowy móc sytuować obowiązujący prawo podstawowy
problem brzmieć kwestia przysługiwać kobieta prawo rozwiązać zgodnie
przepis ustawa dotąd nikt przepis zmienić wydać wejście życie nowy
konstytucja orzeczenie trybunał konstytucyjny mieć moc stanowić mieć
sytuacja obowiązujący prawo przypadek prawo papier chcieć spytać zatem
raz działanie resort zdrowie podjąć podjąć przysługiwać kobieta prawo
przestrzegać wreszcie sprawa zostać rozwiązać gwarantować kobieta
minimum godność prosić konkretny informacja efekt dotychczasowy
działanie podjąć województwo kielecki ostatni czas zgłaszać biuro
poselski kobieta wskazywać prywatnie pieniądz zabieg móc mieć wykonać
jednocześnie lekarz wojewódzki stan wskazać placówka móc mieć wykonać
zabieg darmo uważać sytuacja wyjątkowo bulwersujący Władysław Adamski
Warszawa
|
|
1997-2001
|
2
|
trwać protest anestezjolog przeciwny polityka rząd RP dyskryminować
lekarz okręg gliwicki głodować anestezjolog szpital główny postulat
wyposażyć szpital sprzęt anestezjologiczny często stan użyć zagrażać
życie zdrowie pacjent wzrost wynagrodzenie lekarz anestezjolog
maksymalny stawka dany grupa zaszeregowanie plus premia kwota niski
średni krajowy lekarz wojewódzki A Sośnierz rozmowa głodować lekarz
wygospodarować pieniądz rozesłać dyrektor szpital niestety podwyżka
rezultat rząd złoty zdrowie lekarz podległy pacjent zagrozić sprawa
oczywisty wymagać natychmiastowy realizacja zwracać Jan Olszewski
Warszawa
|